# Clear all
rm(list = ls()) 
set.seed(47474747)
#Load Packages
library(lpSolveAPI)
library(readstata13)
library(dplyr)
library(ggplot2)
library(AER)
library(sandwich)
library(haven)
library(lmtest)
library(boot)
rm(list =ls())

# Set Path <-- Direct to replication data folder
setwd("/Users/vedantvohra/Library/CloudStorage/Dropbox/ACR Bounds/Replication Kit")

#Prepare Data 
original_data <- read_dta("Output/oregon_ed_boot.dta") 

# Load Function
source("Functions/cce.R")

model <- as.formula("ed_visit_30sep2009 ~ ohp_all_mo_30sep2009 + ed_visit_09mar2008 + nnnnumhh_li_2 + nnnnumhh_li_3 |  treatment +  ed_visit_09mar2008 + nnnnumhh_li_2 + nnnnumhh_li_3")

n_all <- 24646
n_zero <- 16930
n_nonzero <- 7716 


wrapper <- function(data, index, subsampleSize,m_n){
  if(m_n == 1){
  main_data <- data[sample(1:nrow(data),size=subsampleSize,replace=TRUE),]
  }else {
  main_data <- data[index,]  
  }
  
  # main_data <- original_data
  
  data_zero <- main_data %>% filter(zero == 1)
  data_nonzero <- main_data %>% filter(zero == 0)
  
  #Regression
  iv_zero <- ivreg(model,
                   data = data_zero)
  
  iv_nonzero <- ivreg(model,
                      data = data_nonzero)
  
  iv_res_zero <- coeftest(iv_zero) 
  iv_res_nonzero <- coeftest(iv_nonzero) 
  
  oregon_acr_zero <- iv_res_zero[2,1]*100
  oregon_acr_nonzero <- iv_res_nonzero[2,1]*100
  
  weight_vars <- list()
  weights <- c()
  
  for (i in 1:19) {
    nam <- paste("covered_months_",i,"_or_more",sep = "")
    weight_vars[[nam]] <- assign(nam, if_else(data_zero$ohp_all_mo_30sep2009 >= i, 1, 0))
    model <- as.formula(paste("weight_vars[[i]] ~ data_zero$treatment + data_zero$nnnnumhh_li_2 + data_zero$nnnnumhh_li_3 + data_zero$ed_visit_09mar2008"))
    model <- lm(model)
    weights <- c(weights,coeftest(model)[2,1])
  }
  
  weights_zero <-weights/sum(weights)
  
  weight_vars <- list()
  weights <- c()
  
  for (i in 1:19) {
    nam <- paste("covered_months_",i,"_or_more",sep = "")
    weight_vars[[nam]] <- assign(nam, if_else(data_nonzero$ohp_all_mo_30sep2009 >= i, 1, 0))
    model <- as.formula(paste("weight_vars[[i]] ~ data_nonzero$treatment + data_nonzero$nnnnumhh_li_2 + data_nonzero$nnnnumhh_li_3 + data_nonzero$ed_visit_09mar2008"))
    model <- lm(model)
    weights <- c(weights,coeftest(model)[2,1])
  }
  
  weights_nonzero <-weights/sum(weights)
  
  # LP Result assuming concavity
  results_conc_zero <- cce(1,weights_zero,oregon_acr_zero)
  results_conc_nonzero <- cce(1,weights_nonzero,oregon_acr_nonzero)
  
  cce_lb_all <- (n_nonzero*results_conc_nonzero$`Lower Bound` + n_zero*results_conc_zero$`Lower Bound` )/n_all
  cce_ub_all <- (n_nonzero*results_conc_nonzero$`Upper Bound` + n_zero*results_conc_zero$`Upper Bound` )/n_all
  
  results_conc_oregon <- c(cce_lb_all,cce_ub_all)
  
  return(results_conc_oregon)
}

N <- nrow(original_data) ##sample size of dataset

q <- 0.95 ##inputs for determining candidate values for m 
k <- 500 

m.candidates <- unique(ceiling(N * q^(0:k)))
m.candidates <- m.candidates[m.candidates>=5000] ##these are the candidate values for m

#This is the number of replicated datasets to be generated for each candidate value m.
#Let it be a large number.

bootrep <- 1000 

#This is the number of replicated datasets to be generated
#for the value m that is selected. 

bootrep.chosenM <- 1000

##STEP 1: Selecting m
numCandidates <- length(m.candidates)

matLB <- matrix(NA, nrow = numCandidates, ncol = bootrep) 
matUB <- matrix(NA, nrow = numCandidates, ncol = bootrep)

for (l in 1:numCandidates){
  boot.obj <- boot(data = original_data, statistic= wrapper, R=bootrep, subsampleSize = m.candidates[l], m_n = 1)
  matLB[l,] <- t(sort(boot.obj$t[,1], na.last = TRUE))
  matUB[l,] <- t(sort(boot.obj$t[,2], na.last = TRUE))
}

##This loop gets the lower and upper bound estimates of the 5,000 bootstrap replicated datasets. The
##results are stored in matLB and matUB, respectively.

##For each candidate value, there is a row in matLB (matUB) with the lower (upper) bound estimates for that candidate value.
##The lower (upper) bound estimates are ordered from smallest to largest, with any NA's at the end.

##Now we select the value m for the lower bound
##Label each row with the candidate value to which it corresponds.
matLB <- cbind(m.candidates, matLB) 

##get rid of candidate m's with any NA's (m was too small, nT or nC = 0) 
matLB <- matLB[complete.cases(matLB),] 

##these are the candidates remaining after excluding those with NA's
m.candidatesLB <- matLB[,1] 

##Take off the labels. We don't want these when choosing m.
matLB <- matLB[,-1] 

##Take the difference between adjacent rows (row 1 - row 2, row2 - row3, row3 - row4, ...)
temp1 <- matLB[1:(nrow(matLB)-1),]-matLB[2:nrow(matLB),] 

##Take the absolute value (|row1 - row2|, |row2 - row3|, |row3 - row4|, ...)
temp1 <- abs(temp1) 

##Find the maximum (max{|row1 - row2|}, max{|row2 - row3|}, max{|row3 - row4}, ...)
temp1 <- apply(temp1, 1, max) 

m.chosenLB <- m.candidatesLB[min(which(temp1==min(temp1)))] 
##Choose the value m with the smallest max absolute difference. 
##If multiple m attain the smallest max absolute difference, choose the largest one.
##m.chosenLB is the value m selected for lower bound

##Separately, but using the same procedure, we select the value m for the upper bound
matUB <- cbind(m.candidates, matUB)
matUB <- matUB[complete.cases(matUB),] 
m.candidatesUB <- matUB[,1]
matUB <- matUB[,-1]
temp1 <- matUB[1:(nrow(matUB)-1),]-matUB[2:nrow(matUB),]
temp1 <- abs(temp1)
temp1 <- apply(temp1, 1, max)
m.chosenUB <- m.candidatesUB[min(which(temp1==min(temp1)))] 

##STEP 2: Run bootstrap using selected values of m##
boot.obj <- boot(data = original_data, statistic= wrapper, R=bootrep.chosenM, subsampleSize = m.chosenLB,m_n = 1)
LB.CI_mn <- c(quantile(boot.obj$t[,1], probs = c(.025,0.975)), m.chosenLB)
UB.CI_mn <- c(quantile(boot.obj$t[,2], probs = c(.025,0.975)), m.chosenUB)

#Final Results
results <- round(c(LB.CI_mn[1],  UB.CI_mn[2]),3)

results
